Practice exercises and activities based on the book "Interactive Data Visualization with Python: Present your data as an effective and compelling story," 2nd Edition
Book link: https://www.amazon.com/Interactive-Data-Visualization-Python-compelling/dp/1800200943
Date: September 2021
Author: Steven Ponce
# Load libraries
import sys
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Silence library warnings so they do not clutter the notebook output
import warnings
warnings.filterwarnings("ignore")
# Report the interpreter version: the first space-separated token of sys.version
print(f"You're running python {sys.version.split(' ')[0]}")
You're running python 3.8.3
# Send Bokeh output to the notebook, using the INLINE resources object.
# (These four statements had been collapsed onto a single line, which is not
# valid Python; they are restored to one statement per line, in the same order.)
from bokeh.resources import INLINE
import bokeh.io
from bokeh import *
bokeh.io.output_notebook(INLINE)
# loading CO2 data
df_co2 = pd.read_csv('co2.csv')
df_co2.tail()
| country | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | 1808 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 187 | Venezuela | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.160 | 6.220 | 5.810 | 6.360 | 6.290 | 6.510 | 6.000 | 6.650 | 6.070 | 6.030 |
| 188 | Vietnam | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.160 | 1.210 | 1.220 | 1.360 | 1.470 | 1.610 | 1.700 | 1.570 | 1.610 | 1.800 |
| 189 | Yemen | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.974 | 1.010 | 0.964 | 0.999 | 1.070 | 0.993 | 0.811 | 0.749 | 0.997 | 0.865 |
| 190 | Zambia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.190 | 0.185 | 0.152 | 0.166 | 0.186 | 0.194 | 0.206 | 0.249 | 0.261 | 0.288 |
| 191 | Zimbabwe | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.832 | 0.796 | 0.742 | 0.573 | 0.406 | 0.552 | 0.665 | 0.530 | 0.776 | 0.780 |
5 rows × 216 columns
# Read the gapminder CSV, decoding the file as ISO-8859-1
gm = pd.read_csv('gm.csv', encoding='ISO-8859-1')
gm.tail()
| Country | Year | fertility | life | population | child_mortality | gdp | region | |
|---|---|---|---|---|---|---|---|---|
| 10106 | Åland | 2002 | NaN | 81.80 | 26257.0 | NaN | NaN | Europe & Central Asia |
| 10107 | Åland | 2003 | NaN | 80.63 | 26347.0 | NaN | NaN | Europe & Central Asia |
| 10108 | Åland | 2004 | NaN | 79.88 | 26530.0 | NaN | NaN | Europe & Central Asia |
| 10109 | Åland | 2005 | NaN | 80.00 | 26766.0 | NaN | NaN | Europe & Central Asia |
| 10110 | Åland | 2006 | NaN | 80.10 | 26923.0 | NaN | NaN | Europe & Central Asia |
gm.shape
(10111, 8)
# drop duplicates
df_gm = gm[['Country', 'region']].drop_duplicates()
df_gm.tail()
| Country | region | |
|---|---|---|
| 9901 | Western Sahara | Middle East & North Africa |
| 9951 | Yemen, Rep. | Middle East & North Africa |
| 10001 | Zambia | Sub-Saharan Africa |
| 10051 | Zimbabwe | Sub-Saharan Africa |
| 10101 | Åland | Europe & Central Asia |
df_gm.shape
(204, 2)
# merge the two DF
'''
You can use merge() any time when you want to do database-like join operations
'''
df_w_regions = pd.merge(df_co2, df_gm, left_on='country', right_on='Country', how='inner')
df_w_regions.tail()
| country | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | 1808 | ... | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | Country | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 168 | Vanuatu | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.450 | 0.423 | 0.524 | 0.512 | 0.546 | 0.459 | 0.420 | 0.595 | Vanuatu | East Asia & Pacific |
| 169 | Venezuela | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5.810 | 6.360 | 6.290 | 6.510 | 6.000 | 6.650 | 6.070 | 6.030 | Venezuela | America |
| 170 | Vietnam | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.220 | 1.360 | 1.470 | 1.610 | 1.700 | 1.570 | 1.610 | 1.800 | Vietnam | East Asia & Pacific |
| 171 | Zambia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.152 | 0.166 | 0.186 | 0.194 | 0.206 | 0.249 | 0.261 | 0.288 | Zambia | Sub-Saharan Africa |
| 172 | Zimbabwe | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.742 | 0.573 | 0.406 | 0.552 | 0.665 | 0.530 | 0.776 | 0.780 | Zimbabwe | Sub-Saharan Africa |
5 rows × 218 columns
df_w_regions.shape
(173, 218)
# drop the duplicate 'Country' column left over from the merge
df_w_regions = df_w_regions.drop('Country', axis='columns')
df_w_regions.tail()
| country | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | 1808 | ... | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | region | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 168 | Vanuatu | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.222 | 0.450 | 0.423 | 0.524 | 0.512 | 0.546 | 0.459 | 0.420 | 0.595 | East Asia & Pacific |
| 169 | Venezuela | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.220 | 5.810 | 6.360 | 6.290 | 6.510 | 6.000 | 6.650 | 6.070 | 6.030 | America |
| 170 | Vietnam | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.210 | 1.220 | 1.360 | 1.470 | 1.610 | 1.700 | 1.570 | 1.610 | 1.800 | East Asia & Pacific |
| 171 | Zambia | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.185 | 0.152 | 0.166 | 0.186 | 0.194 | 0.206 | 0.249 | 0.261 | 0.288 | Sub-Saharan Africa |
| 172 | Zimbabwe | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.796 | 0.742 | 0.573 | 0.406 | 0.552 | 0.665 | 0.530 | 0.776 | 0.780 | Sub-Saharan Africa |
5 rows × 217 columns
df_w_regions.shape
(173, 217)
'''
Pandas.melt() unpivots a DataFrame from wide format to long format.
melt() function is useful to massage a DataFrame into a format where one or more columns are identifier variables,
while all other columns, considered measured variables, are unpivoted to the row axis, leaving just two non-identifier
columns, variable and value.
'''
# Unpivot the wide table: 'country' and 'region' stay as identifier columns,
# every remaining column becomes one row holding its label and its value.
columns = ['country', 'region', 'year', 'co2']
new_co2 = df_w_regions.melt(
    id_vars=columns[:2],    # identifier columns: country, region
    var_name=columns[2],    # former column labels -> 'year'
    value_name=columns[3],  # cell values -> 'co2'
)
new_co2.tail()
| country | region | year | co2 | |
|---|---|---|---|---|
| 37190 | Vanuatu | East Asia & Pacific | 2014 | 0.595 |
| 37191 | Venezuela | America | 2014 | 6.030 |
| 37192 | Vietnam | East Asia & Pacific | 2014 | 1.800 |
| 37193 | Zambia | Sub-Saharan Africa | 2014 | 0.288 |
| 37194 | Zimbabwe | Sub-Saharan Africa | 2014 | 0.780 |
new_co2.shape
(37195, 4)
# Keep 1964 and onward, with 'year' stored as int64.
# The cast is done once, on the full frame, and the same Series is reused for
# the filter. This avoids assigning a column into a filtered slice of new_co2
# (the pattern pandas reports as SettingWithCopyWarning, which the global
# warnings filter would otherwise hide). new_co2 itself is left unmodified.
year_as_int = new_co2['year'].astype('int64')
df_co2 = new_co2.assign(year=year_as_int)[year_as_int > 1963]
# sort DF by country and then year
df_co2 = df_co2.sort_values(by=['country', 'year'])
df_co2.head()
| country | region | year | co2 | |
|---|---|---|---|---|
| 28372 | Afghanistan | South Asia | 1964 | 0.0863 |
| 28545 | Afghanistan | South Asia | 1965 | 0.1010 |
| 28718 | Afghanistan | South Asia | 1966 | 0.1080 |
| 28891 | Afghanistan | South Asia | 1967 | 0.1240 |
| 29064 | Afghanistan | South Asia | 1968 | 0.1160 |
# Build df_gdp from the Country / Year / gdp columns of the gapminder data,
# renaming the two capitalised columns to lower case so that they match the
# column names used in the CO2 frame.
df_gdp = gm[['Country', 'Year', 'gdp']].rename(
    columns={'Country': 'country', 'Year': 'year'}
)
df_gdp.head()
| country | year | gdp | |
|---|---|---|---|
| 0 | Afghanistan | 1964 | 1182.0 |
| 1 | Afghanistan | 1965 | 1182.0 |
| 2 | Afghanistan | 1966 | 1168.0 |
| 3 | Afghanistan | 1967 | 1173.0 |
| 4 | Afghanistan | 1968 | 1187.0 |
df_gdp.shape
(10111, 3)
# We now have one DataFrame for CO2 and one for GDP.
# Left-join GDP onto the CO2 rows by (country, year), then discard every row
# that has a missing value in any column.
data = df_co2.merge(df_gdp, how='left', on=['country', 'year']).dropna()
data.head()
| country | region | year | co2 | gdp | |
|---|---|---|---|---|---|
| 0 | Afghanistan | South Asia | 1964 | 0.0863 | 1182.0 |
| 1 | Afghanistan | South Asia | 1965 | 0.1010 | 1182.0 |
| 2 | Afghanistan | South Asia | 1966 | 0.1080 | 1168.0 |
| 3 | Afghanistan | South Asia | 1967 | 0.1240 | 1173.0 |
| 4 | Afghanistan | South Asia | 1968 | 0.1160 | 1187.0 |
data.shape
(8202, 5)
# create NumPy arrays of the co2 and gdp columns
np_co2 = np.array(data['co2'])
np_gdp = np.array(data['gdp'])
# correlation between co2 and gdp:
# np.corrcoef returns the matrix of Pearson product-moment correlation
# coefficients; the off-diagonal entry [1, 0] is the co2/gdp coefficient.
corr = np.corrcoef(np_co2, np_gdp)
corr[1, 0]
0.7821973088777349
# import libraries
from bokeh.io import curdoc, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider
from bokeh.palettes import Spectral6
from bokeh.layouts import widgetbox, row, column
output_notebook()
# color code the datapoints
regions_list = data.region.unique().tolist()
regions_list
['South Asia', 'Europe & Central Asia', 'Middle East & North Africa', 'Sub-Saharan Africa', 'America', 'East Asia & Pacific']
# assign each region a color from the Spectral6 palette
color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6)
# Initial data source for the plot: only the rows for 1964
rows_1964 = data[data['year'] == 1964]
source = ColumnDataSource(data={
    'x': rows_1964.gdp,
    'y': rows_1964.co2,
    'country': rows_1964.country,
    'region': rows_1964.region,
})
# Axis limits: the overall GDP extent for x, the overall CO2 extent for y
xmin, xmax = data.gdp.min(), data.gdp.max()
ymin, ymax = data.co2.min(), data.co2.max()
# Create the figure: plot (explicit axis ranges, log-scaled y axis)
plot = figure(
    title='CO2 Emissions vs GDP in 1964',
    plot_width=950,
    plot_height=600,
    x_range=(xmin, xmax),
    y_range=(ymin, ymax),
    y_axis_type='log',
)
# add circular glyphs to the plot
plot.circle(x='x',
y='y',
fill_alpha=0.9,
source=source,
legend_group='region',
color=dict(field='region', transform=color_mapper),
size=10)
# Set the legend.location attribute of the plot
plot.legend.location = 'bottom_right'
# Set the x-axis label
plot.xaxis.axis_label = 'Income Per Person'
# Set the y-axis label
plot.yaxis.axis_label = 'CO2 Emissions (tons per person)'
from bokeh.models import PrintfTickFormatter
plot.xaxis[0].formatter = PrintfTickFormatter(format="%1f")
show(plot)
# adding a slider
start = min(data.year)
end = max(data.year)
value = min(data.year)
slider = Slider(start=start, end=end, step=1, value=value, title='Year')
# Define the callback: update_plot
def update_plot(attr, old, new):
    """Refresh the scatter plot for the year currently selected on the slider.

    The signature is the one Bokeh uses for ``on_change`` callbacks.

    Args:
        attr: Name of the property that changed (unused).
        old: Previous value of the property (unused).
        new: New value of the property (unused; the year is read from
            ``slider.value``).

    Reads the module-level ``data`` and ``slider``; replaces ``source.data``
    and updates ``plot.title.text``.
    """
    yr = slider.value
    # Filter the rows for the selected year once and reuse them for every field
    year_rows = data[data['year'] == yr]
    source.data = {
        'x': year_rows.gdp,
        'y': year_rows.co2,
        'country': year_rows.country,
        'region': year_rows.region,
    }
    # Keep the title in the same form as the figure's initial title
    plot.title.text = 'CO2 Emissions vs GDP in %d' % yr
data.keys()
Index(['country', 'region', 'year', 'co2', 'gdp'], dtype='object')
# Attach the callback to the 'value' property of slider
slider.on_change('value', update_plot)
# Put the slider next to the plot. `column` stands in for the deprecated
# `widgetbox`; the plot must be part of the layout as well, otherwise only
# the slider is rendered.
layout = row(column(slider), plot)
curdoc().add_root(layout)
# NOTE: Python callbacks registered with on_change only run when the document
# is served by a Bokeh server application. The standalone output produced by
# show() renders the layout, but moving the slider will not update the plot.
show(layout)
WARNING:bokeh.embed.util:
You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.
Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:
https://docs.bokeh.org/en/latest/docs/user_guide/interaction/callbacks.html
Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:
https://docs.bokeh.org/en/latest/docs/user_guide/server.html
# adding a hover tool
hover = HoverTool(tooltips=[('Country', '@country'), ('GDP', '@x'),
('CO2 Emission', '@y')])
plot.add_tools(hover)
show(layout)
show(plot)
WARNING:bokeh.embed.util:
You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.
Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:
https://docs.bokeh.org/en/latest/docs/user_guide/interaction/callbacks.html
Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:
https://docs.bokeh.org/en/latest/docs/user_guide/server.html
import pandas as pd
import plotly.express as px
# CO2 and GDP data
data.head()
| country | region | year | co2 | gdp | |
|---|---|---|---|---|---|
| 0 | Afghanistan | South Asia | 1964 | 0.0863 | 1182.0 |
| 1 | Afghanistan | South Asia | 1965 | 0.1010 | 1182.0 |
| 2 | Afghanistan | South Asia | 1966 | 0.1080 | 1168.0 |
| 3 | Afghanistan | South Asia | 1967 | 0.1240 | 1173.0 |
| 4 | Afghanistan | South Asia | 1968 | 0.1160 | 1187.0 |
# Fixed axis limits: overall GDP extent for x, overall CO2 extent for y
xmin, xmax = data.gdp.min(), data.gdp.max()
ymin, ymax = data.co2.min(), data.co2.max()
# Animated scatter plot: one frame per year, one facet column and one color
# per region, points tracked across frames by country; x axis is log-scaled.
fig = px.scatter(
    data,
    x='gdp',
    y='co2',
    color='region',
    facet_col='region',
    hover_name='country',
    animation_frame='year',
    animation_group='country',
    log_x=True,
    range_x=[xmin, xmax],
    range_y=[ymin, ymax],
    size_max=45,
    width=1550,
    height=400,
)
fig.show()
import pandas as pd
import plotly.express as px
# CO2 and GDP data from previous exercise
data.tail()
| country | region | year | co2 | gdp | |
|---|---|---|---|---|---|
| 8817 | Zimbabwe | Sub-Saharan Africa | 2009 | 0.406 | 1352.0 |
| 8818 | Zimbabwe | Sub-Saharan Africa | 2010 | 0.552 | 1484.0 |
| 8819 | Zimbabwe | Sub-Saharan Africa | 2011 | 0.665 | 1626.0 |
| 8820 | Zimbabwe | Sub-Saharan Africa | 2012 | 0.530 | 1750.0 |
| 8821 | Zimbabwe | Sub-Saharan Africa | 2013 | 0.776 | 1773.0 |
import plotly.express as px
# scatter plot & box plot using px
fig1 = px.scatter(data, x=data.year,
y=data.co2,
color=data.region,
marginal_y='box')
fig1.show()
# Both figures below plot co2 against gdp, colored by region, animated by
# year (points grouped by country), with a rug plot on the x margin and a
# box plot on the y margin; only the plot type differs.
animated_margin_args = dict(
    x=data.gdp,
    y=data.co2,
    color=data.region,
    marginal_x='rug',
    marginal_y='box',
    animation_frame=data.year,
    animation_group=data.country,
)
# rug plot
fig2 = px.scatter(data, **animated_margin_args)
fig2.show()
# density contour plot
fig3 = px.density_contour(data, **animated_margin_args)
fig3.show()
data.keys()
Index(['country', 'region', 'year', 'co2', 'gdp'], dtype='object')